% Tutorial 5: Analytics
% R Bootcamp HTML Slides
% Jared Knowles
In this lesson we hope to learn:
In this tutorial we will use a number of datasets of different types:
stulong: student-level assessment and demographics data (simulated and research ready)midwest_schools.csv: aggregate school level test score averages from a large Midwest stateload("data/midwest_schools.rda")
head(midsch[, 1:12])
## district_id school_id subject grade n1 ss1 n2 ss2 predicted
## 1 14 130 math 4 44 433.1 40 463.0 468.7
## 2 70 20 math 4 18 443.0 20 477.2 476.5
## 3 112 80 math 4 86 445.4 94 472.6 478.4
## 4 119 50 math 4 95 427.1 94 460.7 464.1
## 5 147 60 math 4 27 424.2 27 458.7 461.8
## 6 147 125 math 4 17 423.5 26 463.1 461.2
## residuals resid_z resid_t
## 1 -5.7446 -0.59190 -0.59171
## 2 0.7235 0.07456 0.07452
## 3 -5.7509 -0.59267 -0.59248
## 4 -3.3586 -0.34606 -0.34591
## 5 -3.0937 -0.31877 -0.31863
## 6 1.8530 0.19094 0.19085
table(midsch$test_year, midsch$grade)
##
## 4 5 6 7 8
## 2007 1150 1094 472 638 734
## 2008 1204 1146 462 588 692
## 2009 1173 1092 434 592 668
## 2010 1120 1090 428 610 686
## 2011 1126 1060 420 618 688
length(unique(midsch$district_id))
## [1] 357
length(unique(midsch$school_id))
## [1] 247
table(midsch$subject, midsch$grade)
##
## 4 5 6 7 8
## math 2886 2741 1108 1523 1734
## read 2887 2741 1108 1523 1734
table(midsch$district_id,midsch$grade)library(ggplot2)
qplot(ss1, ss2, data = midsch, alpha = I(0.07)) + theme_dpi() + geom_smooth() +
geom_smooth(method = "lm", se = FALSE, color = "purple")
test_year, grade, and subject?nrow(unique(midsch[, c(3, 4, 14)]))
## [1] 50
5th grade, 2011, math scores
midsch_sub <- subset(midsch, midsch$grade == 5 & midsch$test_year == 2011 &
midsch$subject == "math")
midsch_sub?my_mod<-lm(ss2~ss1,data=midsch_sub)
lm function~ character divides the dependent variable ss2 from the independent variable ss1my_mod<-data means we don't have to write: lm(midsch_sub$ss2~midsch_sub$ss1)ss_mod <- lm(ss2 ~ ss1, data = midsch_sub)
summary(ss_mod)
##
## Call:
## lm(formula = ss2 ~ ss1, data = midsch_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.36 -7.60 -0.42 6.49 58.36
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.1687 11.3446 -0.46 0.65
## ss1 1.0644 0.0242 44.00 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.2 on 528 degrees of freedom
## Multiple R-squared: 0.786, Adjusted R-squared: 0.785
## F-statistic: 1.94e+03 on 1 and 528 DF, p-value: <2e-16
##
objects(ss_mod)
## [1] "assign" "call" "coefficients" "df.residual"
## [5] "effects" "fitted.values" "model" "qr"
## [9] "rank" "residuals" "terms" "xlevels"
coefficients fitted.values and callqplot(n2, ss2 - ss1, data = midsch, alpha = I(0.1)) + theme_dpi() + geom_smooth()
ssN1_mod <- lm(ss2 ~ ss1 + n1, data = midsch_sub)
summary(ssN1_mod)
##
## Call:
## lm(formula = ss2 ~ ss1 + n1, data = midsch_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.39 -7.73 -0.52 6.42 59.67
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.6849 11.7688 0.14 0.886
## ss1 1.0450 0.0258 40.49 <2e-16 ***
## n1 0.0406 0.0193 2.10 0.036 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.2 on 527 degrees of freedom
## Multiple R-squared: 0.787, Adjusted R-squared: 0.787
## F-statistic: 976 on 2 and 527 DF, p-value: <2e-16
##
ssN2_mod <- lm(ss2 ~ ss1 + n2, data = midsch_sub)
summary(ssN2_mod)
##
## Call:
## lm(formula = ss2 ~ ss1 + n2, data = midsch_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.60 -7.62 -0.53 6.52 59.64
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.7971 11.8544 0.15 0.88
## ss1 1.0450 0.0260 40.12 <2e-16 ***
## n2 0.0377 0.0192 1.97 0.05 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.2 on 527 degrees of freedom
## Multiple R-squared: 0.787, Adjusted R-squared: 0.786
## F-statistic: 975 on 2 and 527 DF, p-value: <2e-16
##
anova(ss_mod, ssN1_mod, ssN2_mod)
## Analysis of Variance Table
##
## Model 1: ss2 ~ ss1
## Model 2: ss2 ~ ss1 + n1
## Model 3: ss2 ~ ss1 + n2
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 528 66239
## 2 527 65688 1 551 4.42 0.036 *
## 3 527 65755 0 -67
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(ssN2_mod)
## [1] 4067
AIC(ssN1_mod)
## [1] 4067
n1 and n2 but either improves model fit over the model without itlibrary(lmtest)
resettest(ss_mod, power = 2:4)
##
## RESET test
##
## data: ss_mod
## RESET = 2.642, df1 = 3, df2 = 525, p-value = 0.04866
##
raintest(ss2 ~ ss1, fraction = 0.5, order.by = ~ss1, data = midsch_sub)
##
## Rainbow test
##
## data: ss2 ~ ss1
## Rain = 1.402, df1 = 265, df2 = 263, p-value = 0.003105
##
harvtest(ss2 ~ ss1, order.by = ~ss1, data = midsch_sub)
##
## Harvey-Collier test
##
## data: ss2 ~ ss1
## HC = 2.734, df = 527, p-value = 0.006462
##
ss_poly <- lm(ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4), data = midsch_sub)
summary(ss_poly)
##
## Call:
## lm(formula = ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4), data = midsch_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -44.89 -6.92 -0.20 6.76 59.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.61e+03 3.61e+04 0.07 0.94
## ss1 -8.72e+00 3.18e+02 -0.03 0.98
## I(ss1^2) -9.51e-03 1.05e+00 -0.01 0.99
## I(ss1^3) 7.21e-05 1.54e-03 0.05 0.96
## I(ss1^4) -6.98e-08 8.42e-07 -0.08 0.93
##
## Residual standard error: 11.1 on 525 degrees of freedom
## Multiple R-squared: 0.789, Adjusted R-squared: 0.787
## F-statistic: 490 on 4 and 525 DF, p-value: <2e-16
##
anova(ss_mod, ss_poly)
## Analysis of Variance Table
##
## Model 1: ss2 ~ ss1
## Model 2: ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 528 66239
## 2 525 65253 3 985 2.64 0.049 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
resettest(ss_poly, power = 2:4)
##
## RESET test
##
## data: ss_poly
## RESET = 1.562, df1 = 3, df2 = 522, p-value = 0.1976
##
raintest(ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4), fraction = 0.5, order.by = ~ss1,
data = midsch_sub)
##
## Rainbow test
##
## data: ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4)
## Rain = 1.392, df1 = 265, df2 = 260, p-value = 0.003804
##
harvtest(ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4), order.by = ~ss1, data = midsch_sub)
##
## Harvey-Collier test
##
## data: ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4)
## HC = NaN, df = 524, p-value = NA
##
ss_polyn <- lm(ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4) + n2, data = midsch_sub)
anova(ss_mod, ssN2_mod, ss_poly, ss_polyn)
## Analysis of Variance Table
##
## Model 1: ss2 ~ ss1
## Model 2: ss2 ~ ss1 + n2
## Model 3: ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4)
## Model 4: ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4) + n2
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 528 66239
## 2 527 65755 1 483 3.91 0.049 *
## 3 525 65253 2 502 2.03 0.133
## 4 524 64842 1 411 3.32 0.069 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
resettest(ss_polyn, power = 2:4)
##
## RESET test
##
## data: ss_polyn
## RESET = 2.485, df1 = 3, df2 = 521, p-value = 0.05991
##
raintest(ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4) + n2, fraction = 0.5, order.by = ~ss1,
data = midsch_sub)
##
## Rainbow test
##
## data: ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4) + n2
## Rain = 1.381, df1 = 265, df2 = 259, p-value = 0.004606
##
harvtest(ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4) + n2, order.by = ~ss1, data = midsch_sub)
##
## Harvey-Collier test
##
## data: ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4) + n2
## HC = NA, df = 523, p-value = NA
##
library(quantreg)
ss_quant <- rq(ss2 ~ ss1, tau = c(seq(0.1, 0.9, 0.1)), data = midsch_sub)
plot(summary(ss_quant, se = "boot", method = "wild"))
ss_quant shows that in the lower quantiles the coefficients for the intercept and ss1 fall outside the confidence interval around the base coefficientss_quant2 <- rq(ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4) + n2, tau = c(seq(0.1,
0.9, 0.1)), data = midsch_sub)
plot(summary(ss_quant2, se = "boot", method = "wild"))
ss_quant3 <- rq(ss2 ~ ss1, tau = -1, data = midsch_sub)
qplot(ss_quant3$sol[1, ], ss_quant3$sol[5, ], geom = "line", main = "Continuous Quantiles") +
theme_dpi() + xlab("Quantile") + ylab(expression(beta)) + geom_hline(yintercept = coef(summary(ss_mod))[2,
1]) + geom_hline(yintercept = coef(summary(ss_mod))[2, 1] + (2 * coef(summary(ss_mod))[2,
2]), linetype = 3) + geom_hline(yintercept = coef(summary(ss_mod))[2, 1] -
(2 * coef(summary(ss_mod))[2, 2]), linetype = 3)
ss_quant4 <- rq(ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4) + n2, tau = -1, data = midsch_sub)
qplot(ss_quant4$sol[1, ], ss_quant4$sol[5, ], geom = "line", main = "Continuous Quantiles") +
theme_dpi() + xlab("Quantile") + ylab(expression(beta)) + geom_hline(yintercept = coef(summary(ss_mod))[2,
1]) + geom_hline(yintercept = coef(summary(ss_mod))[2, 1] + (2 * coef(summary(ss_mod))[2,
2]), linetype = 3) + geom_hline(yintercept = coef(summary(ss_mod))[2, 1] -
(2 * coef(summary(ss_mod))[2, 2]), linetype = 3)
dlplylibrary(plyr)
midsch$id <- interaction(midsch$test_year, midsch$grade, midsch$subject)
mods <- dlply(midsch, .(id), lm, formula = ss2 ~ ss1)
objects(mods)[1:10]
## [1] "2007.4.math" "2007.4.read" "2007.5.math" "2007.5.read" "2007.6.math"
## [6] "2007.6.read" "2007.7.math" "2007.7.read" "2007.8.math" "2007.8.read"
mytest <- llply(mods, function(x) resettest(x, power = 2:4))
mytest[[1]]
##
## RESET test
##
## data: x
## RESET = 2.499, df1 = 3, df2 = 570, p-value = 0.05876
##
mytest[[2]]
##
## RESET test
##
## data: x
## RESET = 0.8864, df1 = 3, df2 = 597, p-value = 0.4478
##
a1 <- qplot(id, residmean, data = ddply(midsch, .(id), summarize, residmean = mean(residuals)),
geom = "bar", main = "Provided Residuals") + theme_dpi() + opts(axis.text.x = theme_blank(),
axis.ticks = theme_blank()) + ylab("Mean of Residuals") + xlab("Model") +
geom_text(aes(x = 12, y = 0.3), label = "SD of Residuals = 9")
a2 <- qplot(id, V1, data = ldply(mods, function(x) mean(x$residuals)), geom = "bar",
main = "Replication Models") + theme_dpi() + opts(axis.text.x = theme_blank(),
axis.ticks = theme_blank()) + ylab("Mean of Residuals") + xlab("Model") +
geom_text(aes(x = 7, y = 0.3), label = paste("SD =", round(mean(ldply(mods,
function(x) sd(x$residuals))$V1), 2)))
grid.arrange(a1, a2, main = "Comparing Replication and Provided Residual Means by Model")
qplot(residuals, data = midsch, geom = "density") + stat_function(fun = dnorm,
aes(colour = "Normal")) + geom_histogram(aes(y = ..density..), alpha = I(0.4)) +
geom_line(aes(y = ..density.., colour = "Empirical"), stat = "density") +
scale_colour_manual(name = "Density", values = c("red", "blue")) + opts(legend.position = c(0.85,
0.85)) + theme_dpi()
b <- 2 * rnorm(5000)
c <- b + runif(5000)
dem <- lm(c ~ b)
a1 <- qplot(midsch$ss1, abs(midsch$residuals), main = "Residual Plot of Replication Data",
geom = "point", alpha = I(0.1)) + geom_smooth(method = "lm", se = TRUE) +
xlab("SS1") + ylab("Residuals") + geom_smooth(se = FALSE) + ylim(c(0, 50)) +
theme_dpi()
a2 <- qplot(b, abs(lm(c ~ b)$residuals), main = "Well Specified OLS", alpha = I(0.3)) +
theme_dpi() + geom_smooth()
grid.arrange(a1, a2, ncol = 2)
bptest(ss_mod)
##
## studentized Breusch-Pagan test
##
## data: ss_mod
## BP = 7.499, df = 1, p-value = 0.006172
##
gqtest(ss_mod)
##
## Goldfeld-Quandt test
##
## data: ss_mod
## GQ = 0.8302, df1 = 263, df2 = 263, p-value = 0.9341
##
lm objectmy_megamod <- lm(ss2 ~ ss1 + grade + test_year + subject, data = midsch)
summary(my_megamod)
##
## Call:
## lm(formula = ss2 ~ ss1 + grade + test_year + subject, data = midsch)
##
## Residuals:
## Min 1Q Median 3Q Max
## -83.58 -6.38 0.69 6.93 62.80
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 415.92105 108.01823 3.85 0.00012 ***
## ss1 0.89548 0.00321 278.85 < 2e-16 ***
## grade -0.72909 0.08014 -9.10 < 2e-16 ***
## test_year -0.16754 0.05380 -3.11 0.00185 **
## subjectread -11.53144 0.15245 -75.64 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.7 on 19980 degrees of freedom
## Multiple R-squared: 0.9, Adjusted R-squared: 0.9
## F-statistic: 4.5e+04 on 4 and 19980 DF, p-value: <2e-16
##
my_megamod2 <- lm(ss2 ~ ss1 + as.factor(grade) + as.factor(test_year) + subject,
data = midsch)
summary(my_megamod2)
##
## Call:
## lm(formula = ss2 ~ ss1 + as.factor(grade) + as.factor(test_year) +
## subject, data = midsch)
##
## Residuals:
## Min 1Q Median 3Q Max
## -77.43 -5.78 0.36 6.18 60.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 72.93813 1.35590 53.79 < 2e-16 ***
## ss1 0.91197 0.00306 298.17 < 2e-16 ***
## as.factor(grade)5 -8.39756 0.20701 -40.57 < 2e-16 ***
## as.factor(grade)6 -0.69535 0.27917 -2.49 0.013 *
## as.factor(grade)7 -2.92812 0.29120 -10.06 < 2e-16 ***
## as.factor(grade)8 -7.64546 0.32318 -23.66 < 2e-16 ***
## as.factor(test_year)2008 -3.08623 0.22493 -13.72 < 2e-16 ***
## as.factor(test_year)2009 -0.46178 0.22667 -2.04 0.042 *
## as.factor(test_year)2010 -1.86967 0.22716 -8.23 < 2e-16 ***
## as.factor(test_year)2011 -1.49652 0.22769 -6.57 5.1e-11 ***
## subjectread -11.59171 0.14416 -80.41 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.2 on 19974 degrees of freedom
## Multiple R-squared: 0.911, Adjusted R-squared: 0.911
## F-statistic: 2.04e+04 on 10 and 19974 DF, p-value: <2e-16
##
It is good to include the session info, e.g. this document is produced with knitr version 0.7. Here is my session info:
print(sessionInfo(), locale = FALSE)
## R version 2.15.1 (2012-06-22)
## Platform: x86_64-pc-mingw32/x64 (64-bit)
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] plyr_1.7.1 quantreg_4.81 SparseM_0.96 lmtest_0.9-30 zoo_1.7-7
## [6] mgcv_1.7-19 ggplot2_0.9.1 gridExtra_0.9 knitr_0.7
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.1-1 dichromat_1.2-4 digest_0.5.2
## [4] evaluate_0.4.2 formatR_0.6 labeling_0.1
## [7] lattice_0.20-6 MASS_7.3-19 Matrix_1.0-6
## [10] memoise_0.1 munsell_0.3 nlme_3.1-104
## [13] proto_0.3-9.2 RColorBrewer_1.0-5 reshape2_1.2.1
## [16] scales_0.2.1 stringr_0.6 tools_2.15.1